import pandas as pd
import numpy as np # For mathematical calculations
import seaborn as sns # For data visualization
import matplotlib.pyplot as plt
import seaborn as sn # For plotting graphs
import io
%matplotlib inline
import warnings # To ignore any warnings
warnings.filterwarnings("ignore")
filepath2 = r"C:\Users\91623\OneDrive\Desktop\projects\COUSTOMER REALATIONPROJECT\data1.txt"
df1= pd.read_csv(filepath2,delimiter=';')
print(df1)
ID Year_Birth Education Marital_Status Income Kidhome \
0 5524 1957 Graduation Single 58138.0 0
1 2174 1954 Graduation Single 46344.0 1
2 4141 1965 Graduation Together 71613.0 0
3 6182 1984 Graduation Together 26646.0 1
4 5324 1981 PhD Married 58293.0 1
... ... ... ... ... ... ...
2235 10870 1967 Graduation Married 61223.0 0
2236 4001 1946 PhD Together 64014.0 2
2237 7270 1981 Graduation Divorced 56981.0 0
2238 8235 1956 Master Together 69245.0 0
2239 9405 1954 PhD Married 52869.0 1
Teenhome Dt_Customer Recency MntWines ... NumWebVisitsMonth \
0 0 2012-09-04 58 635 ... 7
1 1 2014-03-08 38 11 ... 5
2 0 2013-08-21 26 426 ... 4
3 0 2014-02-10 26 11 ... 6
4 0 2014-01-19 94 173 ... 5
... ... ... ... ... ... ...
2235 1 2013-06-13 46 709 ... 5
2236 1 2014-06-10 56 406 ... 7
2237 0 2014-01-25 91 908 ... 6
2238 1 2014-01-24 8 428 ... 3
2239 1 2012-10-15 40 84 ... 7
AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 \
0 0 0 0 0 0
1 0 0 0 0 0
2 0 0 0 0 0
3 0 0 0 0 0
4 0 0 0 0 0
... ... ... ... ... ...
2235 0 0 0 0 0
2236 0 0 0 1 0
2237 0 1 0 0 0
2238 0 0 0 0 0
2239 0 0 0 0 0
Complain Z_CostContact Z_Revenue Response
0 0 3 11 1
1 0 3 11 0
2 0 3 11 0
3 0 3 11 0
4 0 3 11 0
... ... ... ... ...
2235 0 3 11 0
2236 0 3 11 0
2237 0 3 11 0
2238 0 3 11 0
2239 0 3 11 1
[2240 rows x 29 columns]
df1.head()
| ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 2012-09-04 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 2014-03-08 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 2013-08-21 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 2014-02-10 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 2014-01-19 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
5 rows × 29 columns
df1.describe()
| ID | Year_Birth | Income | Kidhome | Teenhome | Recency | MntWines | MntFruits | MntMeatProducts | MntFishProducts | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2240.000000 | 2240.000000 | 2216.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | ... | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.000000 | 2240.0 | 2240.0 | 2240.000000 |
| mean | 5592.159821 | 1968.805804 | 52247.251354 | 0.444196 | 0.506250 | 49.109375 | 303.935714 | 26.302232 | 166.950000 | 37.525446 | ... | 5.316518 | 0.072768 | 0.074554 | 0.072768 | 0.064286 | 0.013393 | 0.009375 | 3.0 | 11.0 | 0.149107 |
| std | 3246.662198 | 11.984069 | 25173.076661 | 0.538398 | 0.544538 | 28.962453 | 336.597393 | 39.773434 | 225.715373 | 54.628979 | ... | 2.426645 | 0.259813 | 0.262728 | 0.259813 | 0.245316 | 0.114976 | 0.096391 | 0.0 | 0.0 | 0.356274 |
| min | 0.000000 | 1893.000000 | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 25% | 2828.250000 | 1959.000000 | 35303.000000 | 0.000000 | 0.000000 | 24.000000 | 23.750000 | 1.000000 | 16.000000 | 3.000000 | ... | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 50% | 5458.500000 | 1970.000000 | 51381.500000 | 0.000000 | 0.000000 | 49.000000 | 173.500000 | 8.000000 | 67.000000 | 12.000000 | ... | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| 75% | 8427.750000 | 1977.000000 | 68522.000000 | 1.000000 | 1.000000 | 74.000000 | 504.250000 | 33.000000 | 232.000000 | 50.000000 | ... | 7.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.0 | 11.0 | 0.000000 |
| max | 11191.000000 | 1996.000000 | 666666.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | ... | 20.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.0 | 11.0 | 1.000000 |
8 rows × 26 columns
df1.shape
(2240, 29)
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2240 entries, 0 to 2239 Data columns (total 29 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 2240 non-null int64 1 Year_Birth 2240 non-null int64 2 Education 2240 non-null object 3 Marital_Status 2240 non-null object 4 Income 2216 non-null float64 5 Kidhome 2240 non-null int64 6 Teenhome 2240 non-null int64 7 Dt_Customer 2240 non-null object 8 Recency 2240 non-null int64 9 MntWines 2240 non-null int64 10 MntFruits 2240 non-null int64 11 MntMeatProducts 2240 non-null int64 12 MntFishProducts 2240 non-null int64 13 MntSweetProducts 2240 non-null int64 14 MntGoldProds 2240 non-null int64 15 NumDealsPurchases 2240 non-null int64 16 NumWebPurchases 2240 non-null int64 17 NumCatalogPurchases 2240 non-null int64 18 NumStorePurchases 2240 non-null int64 19 NumWebVisitsMonth 2240 non-null int64 20 AcceptedCmp3 2240 non-null int64 21 AcceptedCmp4 2240 non-null int64 22 AcceptedCmp5 2240 non-null int64 23 AcceptedCmp1 2240 non-null int64 24 AcceptedCmp2 2240 non-null int64 25 Complain 2240 non-null int64 26 Z_CostContact 2240 non-null int64 27 Z_Revenue 2240 non-null int64 28 Response 2240 non-null int64 dtypes: float64(1), int64(25), object(3) memory usage: 507.6+ KB
Attributes
People:-
ID: Customer's unique identifier Year_Birth: Customer's birth year Education: Customer's education level Marital_Status: Customer's marital status Income: Customer's yearly household income Kidhome: Number of children in customer's household Teenhome: Number of teenagers in customer's household Dt_Customer: Date of customer's enrollment with the company Recency: Number of days since customer's last purchase Complain: 1 if the customer complained in the last 2 years, 0 otherwise
Products:-
MntWines: Amount spent on wine in last 2 years MntFruits: Amount spent on fruits in last 2 years MntMeatProducts: Amount spent on meat in last 2 years MntFishProducts: Amount spent on fish in last 2 years MntSweetProducts: Amount spent on sweets in last 2 years MntGoldProds: Amount spent on gold in last 2 years
Promotion:-
NumDealsPurchases: Number of purchases made with a discount AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise Response: 1 if customer accepted the offer in the last campaign, 0 otherwise
Place:-
NumWebPurchases: Number of purchases made through the company’s website NumCatalogPurchases: Number of purchases made using a catalogue NumStorePurchases: Number of purchases made directly in stores NumWebVisitsMonth: Number of visits to company’s website in the last month
df=df1.drop('ID',axis=1)
df.head()
| Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | MntFruits | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 2012-09-04 | 58 | 635 | 88 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
| 1 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 2014-03-08 | 38 | 11 | 1 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 2 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 2013-08-21 | 26 | 426 | 49 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 3 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 2014-02-10 | 26 | 11 | 4 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
| 4 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 2014-01-19 | 94 | 173 | 43 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
5 rows × 28 columns
unique = [feature for feature in df.columns if len(df[feature].unique())>0 and len(df[feature].unique())<100]
for feature in unique:
print("{} has {} unique values : {} {}".format(feature,len(df[feature].unique()),df[feature].unique(),"\n"))
Year_Birth has 59 unique values : [1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 1983 1976 1959 1952 1987 1946 1980 1949 1982 1979 1951 1969 1986 1989 1963 1970 1973 1943 1975 1996 1968 1964 1977 1978 1955 1966 1988 1948 1958 1972 1960 1945 1991 1962 1953 1961 1956 1992 1900 1893 1990 1947 1899 1993 1994 1941 1944 1995 1940] Education has 5 unique values : ['Graduation' 'PhD' 'Master' 'Basic' '2n Cycle'] Marital_Status has 8 unique values : ['Single' 'Together' 'Married' 'Divorced' 'Widow' 'Alone' 'Absurd' 'YOLO'] Kidhome has 3 unique values : [0 1 2] Teenhome has 3 unique values : [0 1 2] NumDealsPurchases has 15 unique values : [ 3 2 1 5 4 15 7 0 6 9 12 8 10 13 11] NumWebPurchases has 15 unique values : [ 8 1 2 5 6 7 4 3 11 0 27 10 9 23 25] NumCatalogPurchases has 14 unique values : [10 1 2 0 3 4 6 28 9 5 8 7 11 22] NumStorePurchases has 14 unique values : [ 4 2 10 6 7 0 3 8 5 12 9 13 11 1] NumWebVisitsMonth has 16 unique values : [ 7 5 4 6 8 9 20 2 3 1 10 0 14 19 17 13] AcceptedCmp3 has 2 unique values : [0 1] AcceptedCmp4 has 2 unique values : [0 1] AcceptedCmp5 has 2 unique values : [0 1] AcceptedCmp1 has 2 unique values : [0 1] AcceptedCmp2 has 2 unique values : [0 1] Complain has 2 unique values : [0 1] Z_CostContact has 1 unique values : [3] Z_Revenue has 1 unique values : [11] Response has 2 unique values : [1 0]
df.columns = df.columns.str.lower()
df.isnull().sum()
year_birth 0 education 0 marital_status 0 income 24 kidhome 0 teenhome 0 dt_customer 0 recency 0 mntwines 0 mntfruits 0 mntmeatproducts 0 mntfishproducts 0 mntsweetproducts 0 mntgoldprods 0 numdealspurchases 0 numwebpurchases 0 numcatalogpurchases 0 numstorepurchases 0 numwebvisitsmonth 0 acceptedcmp3 0 acceptedcmp4 0 acceptedcmp5 0 acceptedcmp1 0 acceptedcmp2 0 complain 0 z_costcontact 0 z_revenue 0 response 0 dtype: int64
df= df.dropna()
df.isnull().sum()
year_birth 0 education 0 marital_status 0 income 0 kidhome 0 teenhome 0 dt_customer 0 recency 0 mntwines 0 mntfruits 0 mntmeatproducts 0 mntfishproducts 0 mntsweetproducts 0 mntgoldprods 0 numdealspurchases 0 numwebpurchases 0 numcatalogpurchases 0 numstorepurchases 0 numwebvisitsmonth 0 acceptedcmp3 0 acceptedcmp4 0 acceptedcmp5 0 acceptedcmp1 0 acceptedcmp2 0 complain 0 z_costcontact 0 z_revenue 0 response 0 dtype: int64
df.shape
(2216, 28)
df['total_kids'] = df['kidhome']+df['teenhome']
df.head()
| year_birth | education | marital_status | income | kidhome | teenhome | dt_customer | recency | mntwines | mntfruits | ... | acceptedcmp3 | acceptedcmp4 | acceptedcmp5 | acceptedcmp1 | acceptedcmp2 | complain | z_costcontact | z_revenue | response | total_kids | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 2012-09-04 | 58 | 635 | 88 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 | 0 |
| 1 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 2014-03-08 | 38 | 11 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 2 |
| 2 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 2013-08-21 | 26 | 426 | 49 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 0 |
| 3 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 2014-02-10 | 26 | 11 | 4 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 1 |
| 4 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 2014-01-19 | 94 | 173 | 43 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 1 |
5 rows × 29 columns
sns.countplot(df['total_kids'],palette='dark:r');
df['total_accep'] = df['acceptedcmp1'] + df['acceptedcmp2']+df['acceptedcmp3'] + df['acceptedcmp4'] +df['acceptedcmp5'] +df['response']
df['total_purchase'] = df['numdealspurchases']+df['numwebpurchases']+df['numcatalogpurchases']+df['numstorepurchases']
df.head()
| year_birth | education | marital_status | income | kidhome | teenhome | dt_customer | recency | mntwines | mntfruits | ... | acceptedcmp5 | acceptedcmp1 | acceptedcmp2 | complain | z_costcontact | z_revenue | response | total_kids | total_accep | total_purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 2012-09-04 | 58 | 635 | 88 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 1 | 0 | 1 | 25 |
| 1 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 2014-03-08 | 38 | 11 | 1 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 2 | 0 | 6 |
| 2 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 2013-08-21 | 26 | 426 | 49 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 0 | 0 | 21 |
| 3 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 2014-02-10 | 26 | 11 | 4 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 1 | 0 | 8 |
| 4 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 2014-01-19 | 94 | 173 | 43 | ... | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 1 | 0 | 19 |
5 rows × 31 columns
df.drop(['kidhome','teenhome','numdealspurchases','numwebpurchases','numcatalogpurchases','numstorepurchases','numwebvisitsmonth','acceptedcmp3',
'acceptedcmp4','acceptedcmp5','acceptedcmp1','acceptedcmp2','complain','z_costcontact','z_revenue','response'],axis=1,inplace=True)
df.head()
| year_birth | education | marital_status | income | dt_customer | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | total_kids | total_accep | total_purchase | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1957 | Graduation | Single | 58138.0 | 2012-09-04 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 0 | 1 | 25 |
| 1 | 1954 | Graduation | Single | 46344.0 | 2014-03-08 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 0 | 6 |
| 2 | 1965 | Graduation | Together | 71613.0 | 2013-08-21 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 0 | 0 | 21 |
| 3 | 1984 | Graduation | Together | 26646.0 | 2014-02-10 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 1 | 0 | 8 |
| 4 | 1981 | PhD | Married | 58293.0 | 2014-01-19 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 1 | 0 | 19 |
df['age'] = 2014-df['year_birth']
df.head()
| year_birth | education | marital_status | income | dt_customer | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | total_kids | total_accep | total_purchase | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1957 | Graduation | Single | 58138.0 | 2012-09-04 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 0 | 1 | 25 | 57 |
| 1 | 1954 | Graduation | Single | 46344.0 | 2014-03-08 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 0 | 6 | 60 |
| 2 | 1965 | Graduation | Together | 71613.0 | 2013-08-21 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 0 | 0 | 21 | 49 |
| 3 | 1984 | Graduation | Together | 26646.0 | 2014-02-10 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 1 | 0 | 8 | 30 |
| 4 | 1981 | PhD | Married | 58293.0 | 2014-01-19 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 1 | 0 | 19 | 33 |
df['marital_status'] = df['marital_status'].replace({"Together":"Married", "Absurd":"Single", "Widow":"Single", "YOLO":"Single", "Divorced":"Single", "Alone":"Single",})
df
| year_birth | education | marital_status | income | dt_customer | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | total_kids | total_accep | total_purchase | age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1957 | Graduation | Single | 58138.0 | 2012-09-04 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 0 | 1 | 25 | 57 |
| 1 | 1954 | Graduation | Single | 46344.0 | 2014-03-08 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 0 | 6 | 60 |
| 2 | 1965 | Graduation | Married | 71613.0 | 2013-08-21 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 0 | 0 | 21 | 49 |
| 3 | 1984 | Graduation | Married | 26646.0 | 2014-02-10 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 1 | 0 | 8 | 30 |
| 4 | 1981 | PhD | Married | 58293.0 | 2014-01-19 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 1 | 0 | 19 | 33 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 1967 | Graduation | Married | 61223.0 | 2013-06-13 | 46 | 709 | 43 | 182 | 42 | 118 | 247 | 1 | 0 | 18 | 47 |
| 2236 | 1946 | PhD | Married | 64014.0 | 2014-06-10 | 56 | 406 | 0 | 30 | 0 | 0 | 8 | 3 | 1 | 22 | 68 |
| 2237 | 1981 | Graduation | Single | 56981.0 | 2014-01-25 | 91 | 908 | 48 | 217 | 32 | 12 | 24 | 0 | 1 | 19 | 33 |
| 2238 | 1956 | Master | Married | 69245.0 | 2014-01-24 | 8 | 428 | 30 | 214 | 80 | 30 | 61 | 1 | 0 | 23 | 58 |
| 2239 | 1954 | PhD | Married | 52869.0 | 2012-10-15 | 40 | 84 | 3 | 61 | 2 | 1 | 21 | 2 | 1 | 11 | 60 |
2216 rows × 16 columns
print("marital_status :",df.marital_status.unique())
marital_status : ['Single' 'Married']
df['education'] = df['education'].replace({"Basic":"Undergraduate","2n Cycle":"Undergraduate", "Graduation":"Graduate", "Master":"Postgraduate", "PhD":"Postgraduate"})
print("education:",df.education.unique())
education: ['Graduate' 'Postgraduate' 'Undergraduate']
df=df.drop("year_birth",axis=1)
df['isparent'] = df['total_kids']>0
df.head()
| education | marital_status | income | dt_customer | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | total_kids | total_accep | total_purchase | age | isparent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 2012-09-04 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 0 | 1 | 25 | 57 | False |
| 1 | Graduate | Single | 46344.0 | 2014-03-08 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 0 | 6 | 60 | True |
| 2 | Graduate | Married | 71613.0 | 2013-08-21 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 0 | 0 | 21 | 49 | False |
| 3 | Graduate | Married | 26646.0 | 2014-02-10 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 1 | 0 | 8 | 30 | True |
| 4 | Postgraduate | Married | 58293.0 | 2014-01-19 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 1 | 0 | 19 | 33 | True |
df["spent"] = df["mntwines"]+ df["mntfruits"]+ df["mntmeatproducts"]+ df["mntfishproducts"]+ df["mntsweetproducts"]+df["mntgoldprods"]
df.head()
| education | marital_status | income | dt_customer | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | total_kids | total_accep | total_purchase | age | isparent | spent | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 2012-09-04 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 0 | 1 | 25 | 57 | False | 1617 |
| 1 | Graduate | Single | 46344.0 | 2014-03-08 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 0 | 6 | 60 | True | 27 |
| 2 | Graduate | Married | 71613.0 | 2013-08-21 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 0 | 0 | 21 | 49 | False | 776 |
| 3 | Graduate | Married | 26646.0 | 2014-02-10 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 1 | 0 | 8 | 30 | True | 53 |
| 4 | Postgraduate | Married | 58293.0 | 2014-01-19 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 1 | 0 | 19 | 33 | True | 422 |
df1=df
df1= df1.drop(["mntwines","mntfruits","mntmeatproducts","mntfishproducts","mntsweetproducts","mntgoldprods"],axis=1)
df1
| education | marital_status | income | dt_customer | recency | total_kids | total_accep | total_purchase | age | isparent | spent | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 2012-09-04 | 58 | 0 | 1 | 25 | 57 | False | 1617 |
| 1 | Graduate | Single | 46344.0 | 2014-03-08 | 38 | 2 | 0 | 6 | 60 | True | 27 |
| 2 | Graduate | Married | 71613.0 | 2013-08-21 | 26 | 0 | 0 | 21 | 49 | False | 776 |
| 3 | Graduate | Married | 26646.0 | 2014-02-10 | 26 | 1 | 0 | 8 | 30 | True | 53 |
| 4 | Postgraduate | Married | 58293.0 | 2014-01-19 | 94 | 1 | 0 | 19 | 33 | True | 422 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | Graduate | Married | 61223.0 | 2013-06-13 | 46 | 1 | 0 | 18 | 47 | True | 1341 |
| 2236 | Postgraduate | Married | 64014.0 | 2014-06-10 | 56 | 3 | 1 | 22 | 68 | True | 444 |
| 2237 | Graduate | Single | 56981.0 | 2014-01-25 | 91 | 0 | 1 | 19 | 33 | False | 1241 |
| 2238 | Postgraduate | Married | 69245.0 | 2014-01-24 | 8 | 1 | 0 | 23 | 58 | True | 843 |
| 2239 | Postgraduate | Married | 52869.0 | 2012-10-15 | 40 | 2 | 1 | 11 | 60 | True | 172 |
2216 rows × 11 columns
df1= df1.drop(["dt_customer","recency"],axis=1)
df1
| education | marital_status | income | total_kids | total_accep | total_purchase | age | isparent | spent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 0 | 1 | 25 | 57 | False | 1617 |
| 1 | Graduate | Single | 46344.0 | 2 | 0 | 6 | 60 | True | 27 |
| 2 | Graduate | Married | 71613.0 | 0 | 0 | 21 | 49 | False | 776 |
| 3 | Graduate | Married | 26646.0 | 1 | 0 | 8 | 30 | True | 53 |
| 4 | Postgraduate | Married | 58293.0 | 1 | 0 | 19 | 33 | True | 422 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | Graduate | Married | 61223.0 | 1 | 0 | 18 | 47 | True | 1341 |
| 2236 | Postgraduate | Married | 64014.0 | 3 | 1 | 22 | 68 | True | 444 |
| 2237 | Graduate | Single | 56981.0 | 0 | 1 | 19 | 33 | False | 1241 |
| 2238 | Postgraduate | Married | 69245.0 | 1 | 0 | 23 | 58 | True | 843 |
| 2239 | Postgraduate | Married | 52869.0 | 2 | 1 | 11 | 60 | True | 172 |
2216 rows × 9 columns
df["dt_customer"] = pd.to_datetime(df["dt_customer"]).dt.date
print("The newest date is:",max(df["dt_customer"]))
print("The oldest date is:",min(df["dt_customer"]))
import requests
from datetime import date
days = []
d1 = max(df['dt_customer'])
for i in df['dt_customer']:
delta = d1 - i
days.append(delta.days)
df["total_days"] = days
df["total_days"] = pd.to_numeric(df["total_days"], errors="coerce")
The newest date is: 2014-06-29 The oldest date is: 2012-07-30
from pandas_profiling import ProfileReport
cra = ProfileReport(df1)
cra.to_file(output_file="cra_report.html")
df.head()
| education | marital_status | income | dt_customer | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | total_kids | total_accep | total_purchase | age | isparent | spent | total_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 2012-09-04 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 0 | 1 | 25 | 57 | False | 1617 | 663 |
| 1 | Graduate | Single | 46344.0 | 2014-03-08 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 0 | 6 | 60 | True | 27 | 113 |
| 2 | Graduate | Married | 71613.0 | 2013-08-21 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 0 | 0 | 21 | 49 | False | 776 | 312 |
| 3 | Graduate | Married | 26646.0 | 2014-02-10 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 1 | 0 | 8 | 30 | True | 53 | 139 |
| 4 | Postgraduate | Married | 58293.0 | 2014-01-19 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 1 | 0 | 19 | 33 | True | 422 | 161 |
numerical_features = [feature for feature in df.columns if df[feature].dtype != "O"]
print("we have {} numerical features out of {}".format(len(numerical_features),len(df.columns)))
we have 15 numerical features out of 18
discrete_features = [feature for feature in numerical_features if len(df[feature].unique())<25]
print("we have {} discrete features out of {} total features".format(len(discrete_features),len(df.columns)))
we have 3 discrete features out of 18 total features
df[discrete_features].head()
| total_kids | total_accep | isparent | |
|---|---|---|---|
| 0 | 0 | 1 | False |
| 1 | 2 | 0 | True |
| 2 | 0 | 0 | False |
| 3 | 1 | 0 | True |
| 4 | 1 | 0 | True |
fig,ax = plt.subplots(4,4,figsize=(20,15))
for variable, subplot in zip(discrete_features,ax.flatten()):
sns.countplot(df[variable],ax=subplot,palette="winter")
continuos_features = [feature for feature in numerical_features if feature not in discrete_features]
print("we have {} continuos features out of {} total features".format(len(continuos_features),len(df.columns)))
we have 12 continuos features out of 18 total features
fig,ax = plt.subplots(3,3,figsize=(20,12))
for variable, subplot in zip(continuos_features,ax.flatten()):
sns.histplot(df[variable],ax=subplot)
plt.figure(figsize=(20,5))
plt.subplot(1,2,1)
sns.histplot(df['age'],color='red')
plt.title('Distribution of age')
plt.subplot(1,2,2)
sns.histplot(df['spent'],color='blue',bins=10)
plt.title('Distribution of total_amount_spent');
catagorical values
categorical_features = [feature for feature in df.columns if df[feature].dtype == "O"]
print("we have {} continuos features out of {} features".format(len(categorical_features),len(df.columns)))
we have 3 continuos features out of 18 features
df[categorical_features].head()
| education | marital_status | dt_customer | |
|---|---|---|---|
| 0 | Graduate | Single | 2012-09-04 |
| 1 | Graduate | Single | 2014-03-08 |
| 2 | Graduate | Married | 2013-08-21 |
| 3 | Graduate | Married | 2014-02-10 |
| 4 | Postgraduate | Married | 2014-01-19 |
from datetime import date
days = []
d1 = max(df['dt_customer'])
for i in df['dt_customer']:
delta = d1 - i
days.append(delta.days)
df["total_days"] = days
df["total_days"] = pd.to_numeric(df["total_days"], errors="coerce")
plt.figure(figsize=(15,5))
plt.subplot(1,2,1)
sns.countplot(df['marital_status'],palette ='spring')
plt.title('Distribution of marital status')
plt.subplot(1,2,2)
sns.countplot(df['education'],palette ='winter')
plt.title('Distribution of Education');
df[categorical_features].head()
| education | marital_status | dt_customer | |
|---|---|---|---|
| 0 | Graduate | Single | 2012-09-04 |
| 1 | Graduate | Single | 2014-03-08 |
| 2 | Graduate | Married | 2013-08-21 |
| 3 | Graduate | Married | 2014-02-10 |
| 4 | Postgraduate | Married | 2014-01-19 |
plt.figure(figsize=(14,5))
sns.countplot(df1['isparent'],palette='ocean_r')
plt.title('distribution of is parent');
df['family'] = df['marital_status'].replace({'Single':1,"Married":2}) + df['total_kids']
sns.countplot(df['family'],palette='spring')
plt.title('distribution of family')
Text(0.5, 1.0, 'distribution of family')
sns.scatterplot(data =df['age'])
plt.title('scatterplot of age');
df[df['age']>90]
| education | marital_status | income | dt_customer | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | total_kids | total_accep | total_purchase | age | isparent | spent | total_days | family | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 192 | Undergraduate | Single | 36640.0 | 2013-09-26 | 99 | 15 | 6 | 8 | 7 | 4 | 25 | 1 | 0 | 6 | 114 | True | 65 | 276 | 2 |
| 239 | Undergraduate | Single | 60182.0 | 2014-05-17 | 23 | 8 | 0 | 5 | 7 | 0 | 2 | 1 | 0 | 4 | 121 | True | 22 | 43 | 2 |
| 339 | Postgraduate | Married | 83532.0 | 2013-09-26 | 36 | 755 | 144 | 562 | 104 | 64 | 224 | 0 | 1 | 15 | 115 | False | 1853 | 276 | 2 |
df1
| education | marital_status | income | total_kids | total_accep | total_purchase | age | isparent | spent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 0 | 1 | 25 | 57 | False | 1617 |
| 1 | Graduate | Single | 46344.0 | 2 | 0 | 6 | 60 | True | 27 |
| 2 | Graduate | Married | 71613.0 | 0 | 0 | 21 | 49 | False | 776 |
| 3 | Graduate | Married | 26646.0 | 1 | 0 | 8 | 30 | True | 53 |
| 4 | Postgraduate | Married | 58293.0 | 1 | 0 | 19 | 33 | True | 422 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | Graduate | Married | 61223.0 | 1 | 0 | 18 | 47 | True | 1341 |
| 2236 | Postgraduate | Married | 64014.0 | 3 | 1 | 22 | 68 | True | 444 |
| 2237 | Graduate | Single | 56981.0 | 0 | 1 | 19 | 33 | False | 1241 |
| 2238 | Postgraduate | Married | 69245.0 | 1 | 0 | 23 | 58 | True | 843 |
| 2239 | Postgraduate | Married | 52869.0 | 2 | 1 | 11 | 60 | True | 172 |
2216 rows × 9 columns
df= df.drop(["dt_customer"],axis=1)
sns.boxplot(df['age'])
plt.title("Age- Before outlier removal");
df = df.drop(df[df['age']>90].index)
sns.boxplot(df['age'],color='g')
plt.title('Age- After outlier removal')
Text(0.5, 1.0, 'Age- After outlier removal')
sns.scatterplot(data = df['income'])
plt.title('scatterplot of income');
sns.boxplot(df['income'])
plt.title("income- Before outlier removal");
df = df.drop(df[df['income']>600000].index)
sns.boxplot(df['income'])
<AxesSubplot:xlabel='income'>
df1 = df1.drop(df1[df1['income']>600000].index)
df1 = df1.drop(df1[df1['age']>90].index)
df1.head()
| education | marital_status | income | total_kids | total_accep | total_purchase | age | isparent | spent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 0 | 1 | 25 | 57 | False | 1617 |
| 1 | Graduate | Single | 46344.0 | 2 | 0 | 6 | 60 | True | 27 |
| 2 | Graduate | Married | 71613.0 | 0 | 0 | 21 | 49 | False | 776 |
| 3 | Graduate | Married | 26646.0 | 1 | 0 | 8 | 30 | True | 53 |
| 4 | Postgraduate | Married | 58293.0 | 1 | 0 | 19 | 33 | True | 422 |
sns.scatterplot(data = df1['spent'])
plt.title('scatterplot of spent');
df1
| education | marital_status | income | total_kids | total_accep | total_purchase | age | isparent | spent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 0 | 1 | 25 | 57 | False | 1617 |
| 1 | Graduate | Single | 46344.0 | 2 | 0 | 6 | 60 | True | 27 |
| 2 | Graduate | Married | 71613.0 | 0 | 0 | 21 | 49 | False | 776 |
| 3 | Graduate | Married | 26646.0 | 1 | 0 | 8 | 30 | True | 53 |
| 4 | Postgraduate | Married | 58293.0 | 1 | 0 | 19 | 33 | True | 422 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | Graduate | Married | 61223.0 | 1 | 0 | 18 | 47 | True | 1341 |
| 2236 | Postgraduate | Married | 64014.0 | 3 | 1 | 22 | 68 | True | 444 |
| 2237 | Graduate | Single | 56981.0 | 0 | 1 | 19 | 33 | False | 1241 |
| 2238 | Postgraduate | Married | 69245.0 | 1 | 0 | 23 | 58 | True | 843 |
| 2239 | Postgraduate | Married | 52869.0 | 2 | 1 | 11 | 60 | True | 172 |
2212 rows × 9 columns
sns.histplot(df1['income'],color='red',label='income')
sns.histplot(df1['spent'],label='spent')
plt.legend()
plt.title('Histogram of income and spent')
Text(0.5, 1.0, 'Histogram of income and spent')
import plotly.express as px
import plotly.graph_objects as go
fig = px.scatter(df1, x="total_purchase", y="income", color="marital_status",
size="spent", size_max=45, log_x=True)
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="left",
x=0.01
))
fig.show()
fig = px.sunburst(df1, path=['education', 'marital_status','total_purchase'], branchvalues='total',color='total_purchase',template='plotly',title='total purchase')
fig.show()
corrmat=df.corr()
top_corr=corrmat.index
plt.figure(figsize=(20,20))
g= sns.heatmap(df[top_corr].corr(),annot= True,cmap="RdYlGn")
corrmat=df1.corr()
top_corr=corrmat.index
plt.figure(figsize=(20,20))
g= sns.heatmap(df[top_corr].corr(),annot= True,cmap="RdYlGn")
pos_corrmat = df.corr()
filtereddf = pos_corrmat[((pos_corrmat >= .5) & (pos_corrmat !=1.000))]
plt.figure(figsize=(15,5))
sns.heatmap(filtereddf, annot=True, cmap="Reds")
plt.title('Positive correlation matrix')
plt.show()
neg_corrmat = df.corr()
filtereddf = neg_corrmat[((neg_corrmat <-.3) & (neg_corrmat !=1.000))]
plt.figure(figsize=(15,5))
sns.heatmap(filtereddf, annot=True, cmap="Reds")
plt.title('Negative correlation matrix')
plt.show()
pos_corrmat = df1.corr()
filtereddf = pos_corrmat[((pos_corrmat >= .5) & (pos_corrmat !=1.000))]
plt.figure(figsize=(15,5))
sns.heatmap(filtereddf, annot=True, cmap="Reds")
plt.title('Positive correlation matrix')
plt.show()
neg_corrmat = df1.corr()
filtereddf = neg_corrmat[((neg_corrmat <-.3) & (neg_corrmat !=1.000))]
plt.figure(figsize=(15,5))
sns.heatmap(filtereddf, annot=True, cmap="Reds")
plt.title('Negative correlation matrix')
plt.show()
sns.pairplot(df,hue='income')
<seaborn.axisgrid.PairGrid at 0x2508b3651c0>
sns.pairplot(df1,hue='income')
<seaborn.axisgrid.PairGrid at 0x25096aedf70>
df.loc[(df['age'] >= 13) & (df['age'] <= 19), 'ageGroup'] = 'Teen'
df.loc[(df['age'] >= 20) & (df['age']<= 39), 'ageGroup'] = 'Adult'
df.loc[(df['age'] >= 40) & (df['age'] <= 59), 'ageGroup'] = 'Middle Age Adult'
df.loc[(df['age'] > 60), 'ageGroup'] = 'Senior Adult'
df
| education | marital_status | income | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | total_kids | total_accep | total_purchase | age | isparent | spent | total_days | family | ageGroup | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 0 | 1 | 25 | 57 | False | 1617 | 663 | 1 | Middle Age Adult |
| 1 | Graduate | Single | 46344.0 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 0 | 6 | 60 | True | 27 | 113 | 3 | NaN |
| 2 | Graduate | Married | 71613.0 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 0 | 0 | 21 | 49 | False | 776 | 312 | 2 | Middle Age Adult |
| 3 | Graduate | Married | 26646.0 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 1 | 0 | 8 | 30 | True | 53 | 139 | 3 | Adult |
| 4 | Postgraduate | Married | 58293.0 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 1 | 0 | 19 | 33 | True | 422 | 161 | 3 | Adult |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | Graduate | Married | 61223.0 | 46 | 709 | 43 | 182 | 42 | 118 | 247 | 1 | 0 | 18 | 47 | True | 1341 | 381 | 3 | Middle Age Adult |
| 2236 | Postgraduate | Married | 64014.0 | 56 | 406 | 0 | 30 | 0 | 0 | 8 | 3 | 1 | 22 | 68 | True | 444 | 19 | 5 | Senior Adult |
| 2237 | Graduate | Single | 56981.0 | 91 | 908 | 48 | 217 | 32 | 12 | 24 | 0 | 1 | 19 | 33 | False | 1241 | 155 | 1 | Adult |
| 2238 | Postgraduate | Married | 69245.0 | 8 | 428 | 30 | 214 | 80 | 30 | 61 | 1 | 0 | 23 | 58 | True | 843 | 156 | 3 | Middle Age Adult |
| 2239 | Postgraduate | Married | 52869.0 | 40 | 84 | 3 | 61 | 2 | 1 | 21 | 2 | 1 | 11 | 60 | True | 172 | 622 | 4 | NaN |
2212 rows × 19 columns
STANDARDIZE
object_feat = [feature for feature in df1.columns if df1[feature].dtype == "O"]
df1[object_feat].head()
| education | marital_status | |
|---|---|---|
| 0 | Graduate | Single |
| 1 | Graduate | Single |
| 2 | Graduate | Married |
| 3 | Graduate | Married |
| 4 | Postgraduate | Married |
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in object_feat:
df1[i] = df1[[i]].apply(le.fit_transform)
df1.head()
| education | marital_status | income | total_kids | total_accep | total_purchase | age | isparent | spent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 58138.0 | 0 | 1 | 25 | 57 | False | 1617 |
| 1 | 0 | 1 | 46344.0 | 2 | 0 | 6 | 60 | True | 27 |
| 2 | 0 | 0 | 71613.0 | 0 | 0 | 21 | 49 | False | 776 |
| 3 | 0 | 0 | 26646.0 | 1 | 0 | 8 | 30 | True | 53 |
| 4 | 1 | 0 | 58293.0 | 1 | 0 | 19 | 33 | True | 422 |
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df2 = scaler.fit_transform(df1)
df2 = pd.DataFrame(df2,columns = df1.columns)
df1
| education | marital_status | income | total_kids | total_accep | total_purchase | age | isparent | spent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 58138.0 | 0 | 1 | 25 | 57 | False | 1617 |
| 1 | 0 | 1 | 46344.0 | 2 | 0 | 6 | 60 | True | 27 |
| 2 | 0 | 0 | 71613.0 | 0 | 0 | 21 | 49 | False | 776 |
| 3 | 0 | 0 | 26646.0 | 1 | 0 | 8 | 30 | True | 53 |
| 4 | 1 | 0 | 58293.0 | 1 | 0 | 19 | 33 | True | 422 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2235 | 0 | 0 | 61223.0 | 1 | 0 | 18 | 47 | True | 1341 |
| 2236 | 1 | 0 | 64014.0 | 3 | 1 | 22 | 68 | True | 444 |
| 2237 | 0 | 1 | 56981.0 | 0 | 1 | 19 | 33 | False | 1241 |
| 2238 | 1 | 0 | 69245.0 | 1 | 0 | 23 | 58 | True | 843 |
| 2239 | 1 | 0 | 52869.0 | 2 | 1 | 11 | 60 | True | 172 |
2212 rows × 9 columns
scaled_df= df2.copy()
df2
| education | marital_status | income | total_kids | total_accep | total_purchase | age | isparent | spent | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.893586 | 1.349603 | 0.287105 | -1.264598 | 0.617244 | 1.317945 | 1.018352 | -1.581139 | 1.676245 |
| 1 | -0.893586 | 1.349603 | -0.260882 | 1.404572 | -0.502808 | -1.159273 | 1.274785 | 0.632456 | -0.963297 |
| 2 | -0.893586 | -0.740959 | 0.913196 | -1.264598 | -0.502808 | 0.796425 | 0.334530 | -1.581139 | 0.280110 |
| 3 | -0.893586 | -0.740959 | -1.176114 | 0.069987 | -0.502808 | -0.898513 | -1.289547 | 0.632456 | -0.920135 |
| 4 | 0.571657 | -0.740959 | 0.294307 | 0.069987 | -0.502808 | 0.535666 | -1.033114 | 0.632456 | -0.307562 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2207 | -0.893586 | -0.740959 | 0.430444 | 0.069987 | -0.502808 | 0.405286 | 0.163575 | 0.632456 | 1.218061 |
| 2208 | 0.571657 | -0.740959 | 0.560123 | 2.739158 | 0.617244 | 0.926805 | 1.958607 | 0.632456 | -0.271040 |
| 2209 | -0.893586 | 1.349603 | 0.233347 | -1.264598 | 0.617244 | 0.535666 | -1.033114 | -1.581139 | 1.052052 |
| 2210 | 0.571657 | -0.740959 | 0.803172 | 0.069987 | -0.502808 | 1.057185 | 1.103830 | 0.632456 | 0.391336 |
| 2211 | 0.571657 | -0.740959 | 0.042290 | 1.404572 | 0.617244 | -0.507373 | 1.274785 | 0.632456 | -0.722584 |
2212 rows × 9 columns
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(1,11))
visualizer.fit(scaled_df)
visualizer.show();
kmeans = KMeans(n_clusters=3,init='k-means++',random_state=0)
pred_kmeans = kmeans.fit_predict(scaled_df)
df['pred_kmeans'] = pred_kmeans+1
df.head()
| education | marital_status | income | recency | mntwines | mntfruits | mntmeatproducts | mntfishproducts | mntsweetproducts | mntgoldprods | total_kids | total_accep | total_purchase | age | isparent | spent | total_days | family | ageGroup | pred_kmeans | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Graduate | Single | 58138.0 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 0 | 1 | 25 | 57 | False | 1617 | 663 | 1 | Middle Age Adult | 1 |
| 1 | Graduate | Single | 46344.0 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 0 | 6 | 60 | True | 27 | 113 | 3 | NaN | 2 |
| 2 | Graduate | Married | 71613.0 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 0 | 0 | 21 | 49 | False | 776 | 312 | 2 | Middle Age Adult | 1 |
| 3 | Graduate | Married | 26646.0 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 1 | 0 | 8 | 30 | True | 53 | 139 | 3 | Adult | 2 |
| 4 | Postgraduate | Married | 58293.0 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 1 | 0 | 19 | 33 | True | 422 | 161 | 3 | Adult | 3 |
df.isnull().sum()
education 0 marital_status 0 income 0 recency 0 mntwines 0 mntfruits 0 mntmeatproducts 0 mntfishproducts 0 mntsweetproducts 0 mntgoldprods 0 total_kids 0 total_accep 0 total_purchase 0 age 0 isparent 0 spent 0 total_days 0 family 0 ageGroup 49 pred_kmeans 0 dtype: int64
df1['pred_kmeans'] = pred_kmeans+1
df1.head()
| education | marital_status | income | total_kids | total_accep | total_purchase | age | isparent | spent | pred_kmeans | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 58138.0 | 0 | 1 | 25 | 57 | False | 1617 | 1 |
| 1 | 0 | 1 | 46344.0 | 2 | 0 | 6 | 60 | True | 27 | 2 |
| 2 | 0 | 0 | 71613.0 | 0 | 0 | 21 | 49 | False | 776 | 1 |
| 3 | 0 | 0 | 26646.0 | 1 | 0 | 8 | 30 | True | 53 | 2 |
| 4 | 1 | 0 | 58293.0 | 1 | 0 | 19 | 33 | True | 422 | 3 |
personal = df[['education','marital_status','total_kids','total_accep','family','isparent']]
fig,ax = plt.subplots(2,3,figsize=(20,10))
sns.set_style('darkgrid')
for variable, subplot in zip(personal,ax.flatten()):
sns.countplot(df[variable],hue=df['pred_kmeans'],ax=subplot,palette='cool')
other = df[['income','total_purchase','age','spent','total_days','recency']]
fig,ax = plt.subplots(2,3,figsize=(20,10))
sns.set_style('darkgrid')
for variable, subplot in zip(other,ax.flatten()):
sns.kdeplot(data= df,x=variable,hue='pred_kmeans',ax=subplot,palette='cool')
plt.figure(figsize=(20,10))
sns.scatterplot(data=df, x='income', y='spent', hue='pred_kmeans' ,palette='cool');
plt.xlabel('Income', fontsize=20, labelpad=20)
plt.ylabel('Total Spendings', fontsize=20, labelpad=20);
cluster_spendings = df.groupby('pred_kmeans')[['mntwines', 'mntfruits','mntmeatproducts',
'mntfishproducts', 'mntsweetproducts', 'mntgoldprods']].sum()
cluster_spendings.plot(kind='bar', stacked=True, figsize=(9,7), color=['#dc4c4c','#e17070','#157394','#589cb4','#bcb4ac','#3c444c'])
plt.title('Spending Habits by Cluster')
plt.xlabel('Clusters', fontsize=20, labelpad=20)
plt.ylabel('Spendings', fontsize=20, labelpad=20);
plt.xticks(rotation=0, ha='center');
fig = px.scatter(df1, x="total_purchase", y="income", color="pred_kmeans",
size="spent", size_max=45, log_x=True)
fig.update_layout(legend=dict(
yanchor="top",
y=0.99,
xanchor="left",
x=0.01
))
fig.show()
for i in other:
plt.figure()
sns.jointplot(x=df[i], y=df["spent"], hue =df["pred_kmeans"], kind="kde")
plt.show()
--------------------------------------------------------------------------- LinAlgError Traceback (most recent call last) <ipython-input-99-aa2cc0deb072> in <module> 1 for i in other: 2 plt.figure() ----> 3 sns.jointplot(x=df[i], y=df["spent"], hue =df["pred_kmeans"], kind="kde") 4 plt.show() ~\anaconda3\lib\site-packages\seaborn\_decorators.py in inner_f(*args, **kwargs) 44 ) 45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) ---> 46 return f(**kwargs) 47 return inner_f 48 ~\anaconda3\lib\site-packages\seaborn\axisgrid.py in jointplot(x, y, data, kind, color, height, ratio, space, dropna, xlim, ylim, marginal_ticks, joint_kws, marginal_kws, hue, palette, hue_order, hue_norm, **kwargs) 2106 2107 joint_kws.setdefault("color", color) -> 2108 grid.plot_joint(kdeplot, **joint_kws) 2109 2110 marginal_kws.setdefault("color", color) ~\anaconda3\lib\site-packages\seaborn\axisgrid.py in plot_joint(self, func, **kwargs) 1690 1691 if str(func.__module__).startswith("seaborn"): -> 1692 func(x=self.x, y=self.y, **kwargs) 1693 else: 1694 func(self.x, self.y, **kwargs) ~\anaconda3\lib\site-packages\seaborn\_decorators.py in inner_f(*args, **kwargs) 44 ) 45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) ---> 46 return f(**kwargs) 47 return inner_f 48 ~\anaconda3\lib\site-packages\seaborn\distributions.py in kdeplot(x, y, shade, vertical, kernel, bw, gridsize, cut, clip, legend, cumulative, shade_lowest, cbar, cbar_ax, cbar_kws, ax, weights, hue, palette, hue_order, hue_norm, multiple, common_norm, common_grid, levels, thresh, bw_method, bw_adjust, log_scale, color, fill, data, data2, **kwargs) 1740 else: 1741 -> 1742 p.plot_bivariate_density( 1743 common_norm=common_norm, 1744 fill=fill, ~\anaconda3\lib\site-packages\seaborn\distributions.py in plot_bivariate_density(self, common_norm, fill, levels, thresh, color, legend, cbar, cbar_ax, cbar_kws, estimate_kws, **contour_kws) 1080 # Estimate the density of observations at this level 1081 observations = observations["x"], observations["y"] -> 1082 density, support = estimator(*observations, weights=weights) 1083 1084 # Transform the support grid back to the original scale ~\anaconda3\lib\site-packages\seaborn\_statistics.py in __call__(self, x1, x2, weights) 187 return self._eval_univariate(x1, weights) 188 else: --> 189 return self._eval_bivariate(x1, x2, weights) 190 191 ~\anaconda3\lib\site-packages\seaborn\_statistics.py in _eval_bivariate(self, x1, x2, weights) 162 support = self.support 163 if support is None: --> 164 support = self.define_support(x1, x2, cache=False) 165 166 kde = self._fit([x1, x2], weights) ~\anaconda3\lib\site-packages\seaborn\_statistics.py in define_support(self, x1, x2, weights, cache) 119 support = self._define_support_univariate(x1, weights) 120 else: --> 121 support = self._define_support_bivariate(x1, x2, weights) 122 123 if cache: ~\anaconda3\lib\site-packages\seaborn\_statistics.py in _define_support_bivariate(self, x1, x2, weights) 102 clip = (clip, clip) 103 --> 104 kde = self._fit([x1, x2], weights) 105 bw = np.sqrt(np.diag(kde.covariance).squeeze()) 106 ~\anaconda3\lib\site-packages\seaborn\_statistics.py in _fit(self, fit_data, weights) 135 fit_kws["weights"] = weights 136 --> 137 kde = stats.gaussian_kde(fit_data, **fit_kws) 138 kde.set_bandwidth(kde.factor * self.bw_adjust) 139 ~\anaconda3\lib\site-packages\scipy\stats\kde.py in __init__(self, dataset, bw_method, weights) 204 self._neff = 1/sum(self._weights**2) 205 --> 206 self.set_bandwidth(bw_method=bw_method) 207 208 def evaluate(self, points): ~\anaconda3\lib\site-packages\scipy\stats\kde.py in set_bandwidth(self, bw_method) 554 raise ValueError(msg) 555 --> 556 self._compute_covariance() 557 558 def _compute_covariance(self): ~\anaconda3\lib\site-packages\scipy\stats\kde.py in _compute_covariance(self) 566 bias=False, 567 aweights=self.weights)) --> 568 self._data_inv_cov = linalg.inv(self._data_covariance) 569 570 self.covariance = self._data_covariance * self.factor**2 ~\anaconda3\lib\site-packages\scipy\linalg\basic.py in inv(a, overwrite_a, check_finite) 975 inv_a, info = getri(lu, piv, lwork=lwork, overwrite_lu=1) 976 if info > 0: --> 977 raise LinAlgError("singular matrix") 978 if info < 0: 979 raise ValueError('illegal value in %d-th argument of internal ' LinAlgError: singular matrix
<Figure size 576x396 with 0 Axes>
<Figure size 576x396 with 0 Axes>
<Figure size 576x396 with 0 Axes>
<Figure size 576x396 with 0 Axes>
sns.kdeplot(df.income,df.spent)
<AxesSubplot:xlabel='income', ylabel='spent'>
sns.kdeplot(df.recency,df.spent,hue =df["pred_kmeans"])
<AxesSubplot:xlabel='recency', ylabel='spent'>
sns.scatterplot(df.recency,df.total_purchase,hue =df["pred_kmeans"],palette='cool')
<AxesSubplot:xlabel='recency', ylabel='total_purchase'>
sns.scatterplot(df.recency,df.spent,hue =df["pred_kmeans"],palette='cool')
<AxesSubplot:xlabel='recency', ylabel='spent'>
sns.scatterplot(df.recency,df.income,hue =df["pred_kmeans"],palette='cool')
<AxesSubplot:xlabel='recency', ylabel='income'>
palette='cool'
plt.figure(figsize=(20,10))
sns.scatterplot(data=df, x='pred_kmeans', y='spent', hue='pred_kmeans' ,palette='cool');
plt.xlabel('Income', fontsize=20, labelpad=20)
plt.ylabel('Total Spendings', fontsize=20, labelpad=20);